import json
import os

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm

from CryptoFraudDetection.utils import embedding
from CryptoFraudDetection.utils import enums
from CryptoFraudDetection.utils import logger
# Project logger writing to ../logs at INFO level for this notebook run.
LOGGER = logger.Logger(name=__name__, level=enums.LoggerMode.INFO, log_dir="../logs")
# Load the scraped X (Twitter) posts produced by the scraping pipeline.
df = pd.read_parquet("../data/processed/x_posts.parquet")
# Display the first rows to sanity-check the schema.
df.head(5)
| username | tweet | timestamp | likes | impressions | comments | reposts | bookmarks | searchkeyword | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | @officialmcafee | Bitcoin now at $16,600.00. Those of you in the... | 2017-12-08T01:09:27.000Z | 6200.0 | 0.0 | 1259.0 | 4518.0 | 486.0 | Bitcoin |
| 1 | @CharlieShrem | Time to buy #bitcoin | 2018-02-01T16:37:51.000Z | 17000.0 | 0.0 | 469.0 | 6858.0 | 16.0 | Bitcoin |
| 2 | @rogerkver | I just bought 50 Bitcoin (BCH) ATM machines fo... | 2018-02-13T00:53:30.000Z | 2000.0 | 0.0 | 778.0 | 701.0 | 2.0 | Bitcoin |
| 3 | @rogerkver | Who wants a Bitcoin Cash Visa debit card? htt... | 2017-12-11T16:12:51.000Z | 4300.0 | 0.0 | 765.0 | 1929.0 | 2.0 | Bitcoin |
| 4 | @ARealHyena | I liked a \n@YouTube\n video http://youtu.be/i... | 2018-02-27T23:59:37.000Z | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Bitcoin |
# Distribution of posts per search keyword (one keyword per coin).
df["searchkeyword"].value_counts()
searchkeyword Bitcoin 4405 Ethereum 3075 Chainlink 1912 thorchain 1640 $Atom 1558 Bitforex 1543 Terra Luna 1383 $Avax 1341 $FTT 1181 Safemoon 854 $STA 372 Beercoin 270 Teddy Doge 47 Name: count, dtype: int64
# Number of NA values per column.
df.isna().sum()
username 0 tweet 0 timestamp 0 likes 0 impressions 0 comments 0 reposts 0 bookmarks 0 searchkeyword 0 dtype: int64
# Number of empty strings per column (the NA check above misses these).
(df == "").sum()
username 4 tweet 272 timestamp 0 likes 0 impressions 0 comments 0 reposts 0 bookmarks 0 searchkeyword 0 dtype: int64
# Show every row whose username is an empty string.
df[df["username"] == ""]
| username | tweet | timestamp | likes | impressions | comments | reposts | bookmarks | searchkeyword | |
|---|---|---|---|---|---|---|---|---|---|
| 3369 | . \n@beeple\n fully unicode spec compliant, th... | 2021-08-26T03:30:24.000Z | 8.0 | 0.0 | 0.0 | 2.0 | 0.0 | Ethereum | |
| 7521 | you're not an investor, you're a tokenholder | 2023-10-27T20:58:19.000Z | 0.0 | 20.0 | 0.0 | 0.0 | 0.0 | Chainlink | |
| 16925 | 2023-07-14T02:29:05.000Z | 0.0 | 15.0 | 0.0 | 0.0 | 0.0 | Chainlink | ||
| 16992 | staking v0.2 has semi-slashing commence bull t... | 2023-09-04T05:24:02.000Z | 1.0 | 234.0 | 2.0 | 0.0 | 0.0 | Chainlink |
# Show every row whose tweet text is an empty string.
df[df["tweet"] == ""]
| username | tweet | timestamp | likes | impressions | comments | reposts | bookmarks | searchkeyword | |
|---|---|---|---|---|---|---|---|---|---|
| 389 | @2357_is_prime | 2019-09-16T17:50:04.000Z | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Bitcoin | |
| 472 | @cryptorick_ | 2019-05-05T09:53:01.000Z | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | Bitcoin | |
| 486 | @PayneFullHuman | 2019-05-19T13:39:31.000Z | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | Bitcoin | |
| 617 | @BitcoinWanda | 2020-11-07T10:43:52.000Z | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | Bitcoin | |
| 776 | @Bitcoin21oooooo | 2022-10-25T14:05:39.000Z | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | Bitcoin | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17913 | @LiamSolo42 | 2024-08-28T18:39:59.000Z | 0.0 | 49.0 | 0.0 | 0.0 | 0.0 | thorchain | |
| 17928 | @f1uffypaws | 2022-10-22T21:32:53.000Z | 0.0 | 38.0 | 0.0 | 0.0 | 0.0 | thorchain | |
| 17959 | @OliverLaFarge | 2024-01-05T15:12:26.000Z | 1.0 | 132.0 | 0.0 | 0.0 | 0.0 | thorchain | |
| 17985 | @matty_dot_thor | 2024-09-24T02:56:04.000Z | 2.0 | 75.0 | 0.0 | 0.0 | 0.0 | thorchain | |
| 19064 | @bitforexcom | 2023-10-06T07:33:57.000Z | 0.0 | 5.0 | 0.0 | 0.0 | 0.0 | Bitforex |
272 rows × 9 columns
# Count empty tweet strings per coin keyword and report them.
empty_per_coin = (
    df.loc[df['tweet'] == '', 'searchkeyword']
    .value_counts()
    .sort_index()
)
for coin, n_empty in empty_per_coin.items():
    print(f"{coin}: {n_empty} leere Strings im Tweet-Feld")
Beercoin: 2 leere Strings im Tweet-Feld Bitcoin: 51 leere Strings im Tweet-Feld Bitforex: 2 leere Strings im Tweet-Feld Chainlink: 81 leere Strings im Tweet-Feld Ethereum: 13 leere Strings im Tweet-Feld Safemoon: 64 leere Strings im Tweet-Feld Terra Luna: 9 leere Strings im Tweet-Feld thorchain: 50 leere Strings im Tweet-Feld
# Drop rows whose tweet text is empty.
df = df[df["tweet"] != ""]

# Per-coin post counts as a tidy two-column frame for seaborn.
keyword_counts = (
    df['searchkeyword']
    .value_counts()
    .rename_axis('searchkeyword')
    .reset_index(name='count')
)

plt.style.use('dark_background')
sns.set_theme(style="dark")

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=keyword_counts,
    x='searchkeyword',
    y='count',
    hue='searchkeyword',
    dodge=False,
    palette='viridis',
    legend=False,
    ax=ax,
)

ax.set_xlabel('Search Keyword', color='white')
ax.set_ylabel('Anzahl', color='white')
ax.set_title('Count of Search Keywords', color='white')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', color='white')
plt.setp(ax.get_yticklabels(), color='white')

# Dark styling for axes, spines, ticks and grid.
ax.set_facecolor('black')
fig.patch.set_facecolor('black')
for spine in ax.spines.values():
    spine.set_color('white')
ax.tick_params(colors='white', which='both')
ax.grid(True, color='gray', linestyle='--', linewidth=0.5)

# Annotate each bar with its count, slightly above the bar top.
y_offset = 0.02 * keyword_counts['count'].max()
for bar in ax.patches:
    ax.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=bar.get_height() + y_offset,
        s=f'{int(bar.get_height())}',
        ha='center',
        color='white',
        fontsize=10,
    )

plt.tight_layout()
plt.show()
This plot shows the number of tweets per coin in the scraped X dataset. Some coins have far fewer tweets, but that is because they also have a shorter price time series, which we use as the start and end dates for scraping.
# Cache file for the tweet embeddings (computed once, then reloaded).
output_file = "../data/processed/x_posts_embeddings.parquet"
# Registers tqdm for pandas .progress_apply.
# NOTE(review): progress_apply is never used below (embed() is called on a
# plain list), so this registration appears unused — confirm before removing.
tqdm.pandas(desc="Embedding Progress")
# Project text-embedding wrapper (the cell output shows it downloads its
# model, jina-embeddings-v2-small-en, on first use).
embedder = embedding.Embedder(LOGGER)
def generate_embeddings(df):
    """Return a copy of *df* with an ``embedding`` column added.

    Each tweet text is embedded via the module-level ``embedder``.

    The original version mutated the passed-in DataFrame in place; since
    *df* is a filtered slice at the call site, that also risked pandas'
    SettingWithCopy behaviour. Working on an explicit copy fixes both,
    and the caller rebinds the return value, so this stays compatible.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``tweet`` column of strings.

    Returns
    -------
    pd.DataFrame
        Copy of *df* with one embedding vector per row in ``embedding``.
    """
    result = df.copy()
    # embed() takes the raw tweet strings and returns one vector per tweet.
    result['embedding'] = embedder.embed(result['tweet'].tolist())
    return result
# Load the cached embeddings if present; otherwise compute and cache them.
if os.path.exists(output_file):
    print(f"Datei existiert bereits: {output_file}. Lade die Datei...")
    df = pd.read_parquet(output_file)
    print("DataFrame erfolgreich geladen.")
else:
    print("Die Datei existiert nicht. Berechne Embeddings...")
    df = generate_embeddings(df)

    # Map raw search keywords onto canonical coin names.
    keyword_to_coin = {
        'Bitcoin': 'Bitcoin',
        'Ethereum': 'Ethereum',
        'Chainlink': 'Chainlink',
        'thorchain': 'THORChain',
        '$Atom': 'Cosmos',
        'Bitforex': 'BitForex',
        '$Avax': 'Avalanche',
        'Terra Luna': 'Terra Luna',
        '$FTT': 'FTX Token',
        'Safemoon': 'Safe Moon',
        '$STA': 'STOA Network',
        'Beercoin': 'BeerCoin',
        'Teddy Doge': 'Teddy Doge'
    }
    df['searchkeyword'] = df['searchkeyword'].replace(keyword_to_coin)

    df.to_parquet(output_file)
    print(f"Embeddings gespeichert unter: {output_file}")
config.json: 0%| | 0.00/1.18k [00:00<?, ?B/s]
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\huggingface_hub\file_download.py:139: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\Users\can-e\.cache\huggingface\hub\models--jinaai--jina-embeddings-v2-small-en. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations. To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development warnings.warn(message)
model.safetensors: 0%| | 0.00/65.4M [00:00<?, ?B/s]
tokenizer_config.json: 0%| | 0.00/373 [00:00<?, ?B/s]
vocab.txt: 0%| | 0.00/232k [00:00<?, ?B/s]
tokenizer.json: 0%| | 0.00/712k [00:00<?, ?B/s]
special_tokens_map.json: 0%| | 0.00/125 [00:00<?, ?B/s]
Datei existiert bereits: ../data/processed/x_posts_embeddings.parquet. Lade die Datei... DataFrame erfolgreich geladen.
# Inspect the frame again — it now includes the 'embedding' column.
df.head(5)
| username | tweet | timestamp | likes | impressions | comments | reposts | bookmarks | searchkeyword | embedding | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | @officialmcafee | Bitcoin now at $16,600.00. Those of you in the... | 2017-12-08T01:09:27.000Z | 6200.0 | 0.0 | 1259.0 | 4518.0 | 486.0 | Bitcoin | [-0.89842653, -0.46295443, 0.3335528, 0.286954... |
| 1 | @CharlieShrem | Time to buy #bitcoin | 2018-02-01T16:37:51.000Z | 17000.0 | 0.0 | 469.0 | 6858.0 | 16.0 | Bitcoin | [-0.6538949, -0.40458962, 0.34390834, 0.860342... |
| 2 | @rogerkver | I just bought 50 Bitcoin (BCH) ATM machines fo... | 2018-02-13T00:53:30.000Z | 2000.0 | 0.0 | 778.0 | 701.0 | 2.0 | Bitcoin | [-0.6145951, -0.06576121, -0.09537013, 0.43106... |
| 3 | @rogerkver | Who wants a Bitcoin Cash Visa debit card? htt... | 2017-12-11T16:12:51.000Z | 4300.0 | 0.0 | 765.0 | 1929.0 | 2.0 | Bitcoin | [-0.16293974, -0.23298289, 0.1674211, 0.296802... |
| 4 | @ARealHyena | I liked a \n@YouTube\n video http://youtu.be/i... | 2018-02-27T23:59:37.000Z | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Bitcoin | [-0.43297186, 0.010015366, -0.025700409, 0.749... |
# Confirm the column layout after loading the cached embeddings.
df.columns
Index(['username', 'tweet', 'timestamp', 'likes', 'impressions', 'comments',
'reposts', 'bookmarks', 'searchkeyword', 'embedding'],
dtype='object')
# Coins held out as the test split for downstream modelling.
coin_test = ['FTX Token', 'Safe Moon', 'Ethereum', 'Cosmos']
# Remove the held-out coins from the analysis frame.
df = df[~df['searchkeyword'].isin(coin_test)]
# Coin metadata, including the 'fraud' ground-truth label.
with open('../data/raw/coins.json', 'r') as f:
    coins_data = json.load(f)
coins_info_df = pd.DataFrame(coins_data)
# Attach coin metadata to every tweet; the left join keeps all tweets
# (tweets without a matching coin get NaN metadata).
merged_df = df.merge(coins_info_df, left_on="searchkeyword", right_on="name", how="left")
# Stack the per-row embedding vectors into one (n_tweets, dim) matrix.
embeddings = np.vstack(merged_df["embedding"].values)
fraud_labels = merged_df["fraud"]
# 2-D projections: linear (PCA) and non-linear (UMAP with cosine metric).
pca = PCA(n_components=2)
embeddings_pca = pca.fit_transform(embeddings)
# random_state makes UMAP deterministic; as the warning below notes, it
# also forces single-threaded execution.
umap_reducer = umap.UMAP(n_components=2, random_state=42, metric='cosine')
embeddings_umap = umap_reducer.fit_transform(embeddings)
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
# Scatter both 2-D projections side by side, coloured by the fraud label.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
point_colors = ['red' if is_fraud else 'blue' for is_fraud in fraud_labels]

projections = [
    (embeddings_pca, "PCA of Embeddings", "PCA Component 1", "PCA Component 2", 0.25),
    (embeddings_umap, "UMAP of Embeddings", "UMAP Dimension 1", "UMAP Dimension 2", 0.15),
]
for axis, (coords, title, xlab, ylab, alpha) in zip(axes, projections):
    axis.scatter(coords[:, 0], coords[:, 1], c=point_colors, alpha=alpha, edgecolor='k')
    axis.set_title(title)
    axis.set_xlabel(xlab)
    axis.set_ylabel(ylab)

# Manual legend: red = fraud, blue = non-fraud.
fraud_legend = [
    plt.Line2D([0], [0], marker='o', color='w', label='Fraud', markerfacecolor='red', markersize=10),
    plt.Line2D([0], [0], marker='o', color='w', label='Non-Fraud', markerfacecolor='blue', markersize=10),
]
fig.legend(handles=fraud_legend, loc="upper right")
plt.tight_layout()
plt.show()
Here we can see the embeddings plotted as the top two PCA and UMAP components. The color indicates whether the tweet was about a scam or a non-scam coin. Clusters of scam and non-scam embeddings are visible, but these may simply be tweets about the same coin.
# Same projections, this time coloured by coin instead of fraud label.
coin_labels = merged_df["searchkeyword"].values
unique_coins = np.unique(coin_labels)

# Assign each coin a colour spread across the tab10 colormap.
coin_colors = {}
for idx, coin in enumerate(unique_coins):
    coin_colors[coin] = plt.cm.tab10(idx / len(unique_coins))
colors = [coin_colors[label] for label in coin_labels]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
plot_specs = [
    (embeddings_pca, "PCA of Embeddings", "PCA Component 1", "PCA Component 2"),
    (embeddings_umap, "UMAP of Embeddings", "UMAP Dimension 1", "UMAP Dimension 2"),
]
for axis, (coords, title, xlab, ylab) in zip(axes, plot_specs):
    axis.scatter(coords[:, 0], coords[:, 1], c=colors, alpha=0.1, edgecolor='k')
    axis.set_title(title)
    axis.set_xlabel(xlab)
    axis.set_ylabel(ylab)

# One legend entry per coin, using its assigned colour.
legend_elements = [
    plt.Line2D([0], [0], marker='o', color='w', label=coin, markerfacecolor=coin_colors[coin], markersize=10)
    for coin in unique_coins
]
fig.legend(handles=legend_elements, loc="upper right", title="Coins")
plt.tight_layout()
plt.show()
Here we can see the same plot, but with the color representing the coin. The embeddings are clearly clustered by coin, so it is not directly possible to say that they cluster by scam versus non-scam.
# One PCA/UMAP pair per coin, to inspect within-coin structure.
unique_keywords = df['searchkeyword'].unique()
for keyword in unique_keywords:
    coin_embeddings = np.vstack(df.loc[df['searchkeyword'] == keyword, 'embedding'].values)

    coords_pca = PCA(n_components=2).fit_transform(coin_embeddings)
    coords_umap = umap.UMAP(n_components=2, metric='cosine', random_state=42).fit_transform(coin_embeddings)

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    for axis, (coords, title, xlab, ylab) in zip(axes, [
        (coords_pca, f"PCA of Embeddings for {keyword}", "PCA Component 1", "PCA Component 2"),
        (coords_umap, f"UMAP of Embeddings for {keyword}", "UMAP Dimension 1", "UMAP Dimension 2"),
    ]):
        axis.scatter(coords[:, 0], coords[:, 1], alpha=0.7, edgecolor='k')
        axis.set_title(title)
        axis.set_xlabel(xlab)
        axis.set_ylabel(ylab)

    plt.tight_layout()
    plt.show()
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
c:\Users\can-e\PycharmProjects\main\.venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
Here we can see the same plots, but for each coin individually. Scam coins tend to have denser embedding clusters, though often several of them, while non-scam coins typically form one big cluster with many outliers. This is only a tendency, however: some scam coins have a single cluster and some non-scam coins have several clusters.
# Coins remaining after the test-split removal above.
unique_keywords = df['searchkeyword'].unique()
def calculate_metrics(embeddings, n_neighbors=12):
    """Compute spread/density metrics for a set of embedding vectors.

    Parameters
    ----------
    embeddings : np.ndarray of shape (n_samples, dim)
        Embedding matrix; needs at least 2 rows.
    n_neighbors : int, default 12
        Number of nearest neighbours for the local-density estimate;
        clipped to n_samples - 1 for small groups.

    Returns
    -------
    tuple of float
        (average pairwise cosine distance,
         average euclidean distance to the k nearest neighbours,
         mean per-dimension standard deviation)
    """
    # Mean cosine distance over all unordered pairs.
    avg_pairwise_distance = pdist(embeddings, metric='cosine').mean()

    # k-NN local density on euclidean distances. The previous
    # NearestNeighbors-based version counted each point as its own nearest
    # neighbour (distance 0, biasing the average downwards) and raised
    # when a group had fewer than 12 points; both are fixed here.
    n_samples = len(embeddings)
    k = min(n_neighbors, n_samples - 1)
    dist_matrix = squareform(pdist(embeddings, metric='euclidean'))
    sorted_dists = np.sort(dist_matrix, axis=1)
    avg_local_density = sorted_dists[:, 1:k + 1].mean()  # column 0 is self

    # Mean of the per-dimension standard deviations.
    multivariate_std = np.std(embeddings, axis=0).mean()
    return avg_pairwise_distance, avg_local_density, multivariate_std
# Re-attach the fraud label (only name + fraud this time) to the tweets.
with open('../data/raw/coins.json', 'r') as f:
    coins_data = json.load(f)
coins_info_df = pd.DataFrame(coins_data)
merged_df = df.merge(coins_info_df[['name', 'fraud']], left_on="searchkeyword", right_on="name", how="left")

# Sanity check: the left join should always yield a 'fraud' column.
if 'fraud' not in merged_df.columns:
    print("Die 'fraud'-Spalte konnte nicht hinzugefügt werden. Bitte überprüfen Sie die Zuordnung.")

unique_keywords = merged_df['searchkeyword'].unique()

def _metrics_or_zeros(values):
    """Stack the embeddings and compute metrics; (0, 0, 0) for empty groups."""
    if len(values) == 0:
        return (0, 0, 0)
    return calculate_metrics(np.vstack(values))

# Per coin: metrics for the fraud-labelled and non-fraud-labelled tweets.
metrics_data = []
for keyword in unique_keywords:
    subset = merged_df[merged_df['searchkeyword'] == keyword]
    metrics_data.append({
        'keyword': keyword,
        'fraud': _metrics_or_zeros(subset[subset['fraud'] == True]['embedding'].values),
        'non_fraud': _metrics_or_zeros(subset[subset['fraud'] == False]['embedding'].values),
    })
# Grouped bar chart per metric: fraud vs non-fraud, one group per coin.
fig, axes = plt.subplots(3, 1, figsize=(12, 18))
metric_names = ['Average Pairwise Distance', 'Average Local Density', 'Multivariate Standard Deviation']
keywords = [entry['keyword'] for entry in metrics_data]
x = np.arange(len(keywords))
bar_width = 0.4

for metric_idx, (axis, metric_name) in enumerate(zip(axes, metric_names)):
    fraud_vals = [entry['fraud'][metric_idx] for entry in metrics_data]
    clean_vals = [entry['non_fraud'][metric_idx] for entry in metrics_data]
    axis.bar(x - bar_width / 2, fraud_vals, width=bar_width, label='Fraud', color='red', alpha=0.7)
    axis.bar(x + bar_width / 2, clean_vals, width=bar_width, label='Non-Fraud', color='blue', alpha=0.7)
    axis.set_title(metric_name)
    axis.set_xticks(x)
    axis.set_xticklabels(keywords, rotation=45, ha='right')
    axis.set_ylabel('Value')
    axis.legend()

plt.tight_layout()
plt.show()
In these plots we can see some metrics of the embeddings, such as the average pairwise distance and the local density. Scam coins appear slightly denser, but the difference is small and does not hold as a general rule.
def max_similar_embeddings_normalized(embeddings, similarity_threshold=0.95):
    """Fraction of all embeddings that the single most-duplicated embedding
    is near-identical to (cosine similarity strictly above the threshold).
    """
    sims = cosine_similarity(embeddings)
    # Every vector is perfectly similar to itself; blank the diagonal so
    # self-similarity never counts.
    np.fill_diagonal(sims, 0)
    over_threshold_per_row = (sims > similarity_threshold).sum(axis=1)
    return over_threshold_per_row.max() / len(embeddings)
# Per-coin share of near-duplicate embeddings (cosine sim > 0.9),
# plotted as bars sorted from highest to lowest share.
unique_keywords = merged_df['searchkeyword'].unique()
similarity_data = []
for keyword in unique_keywords:
    coin_embeddings = np.vstack(merged_df.loc[merged_df['searchkeyword'] == keyword, 'embedding'].values)
    similarity_data.append({
        'keyword': keyword,
        'max_similar_embeddings_normalized': max_similar_embeddings_normalized(coin_embeddings, similarity_threshold=0.9),
    })

# Sort coins by duplicate share, highest first.
sorted_data = sorted(
    similarity_data,
    key=lambda entry: entry['max_similar_embeddings_normalized'],
    reverse=True,
)
sorted_keywords = [entry['keyword'] for entry in sorted_data]
sorted_values = [entry['max_similar_embeddings_normalized'] for entry in sorted_data]

# Ground-truth fraud flag per coin, in sorted order, mapped to bar colours.
bar_colors = [
    'red' if coins_info_df[coins_info_df['name'] == keyword]['fraud'].iloc[0] else 'blue'
    for keyword in sorted_keywords
]

plt.figure(figsize=(12, 6))
plt.bar(sorted_keywords, sorted_values, color=bar_colors, alpha=0.7)
plt.title("Normierte maximale Anzahl ähnlicher Embeddings pro Coin (nach Wert sortiert)")
plt.xlabel("Coin")
plt.ylabel("Normierte maximale Anzahl ähnlicher Embeddings")
plt.xticks(rotation=45, ha='right')
plt.legend(handles=[
    mpatches.Patch(color='red', label='Scam'),
    mpatches.Patch(color='blue', label='Non-Scam'),
])
plt.tight_layout()
plt.show()
Here we can see the maximum number of embeddings that are close to each other (cosine similarity above 0.9), normalized by the total number of embeddings per coin. It is clearly visible that scam coins tend to have more near-duplicate embeddings than non-scam coins, although this is not a strict rule (see Avalanche).